{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 4: Data Transformation Techniques"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simple aggregations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_csv('../data/contoso_sales.csv', try_parse_dates=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from polars import selectors as cs\n",
    "(\n",
    "    df\n",
    "    .select(cs.numeric())\n",
    "    .sum()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = df.select('Quantity').to_series()\n",
    "s.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(pl.col('Quantity').sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(\n",
    "    pl.col('Customer Name').first().alias('Cust Name First'),\n",
    "    pl.col('Customer Name').last().alias('Cust Name Last')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(cs.numeric()).describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### There is more..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(\n",
    "    (pl.col('Quantity') >= 4).sum()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(\n",
    "    pl.col('Quantity').filter(pl.col('Store Name')=='Online store').sum()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using group by aggregations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_csv('../data/contoso_sales.csv', try_parse_dates=True)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.group_by('Brand')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .group_by('Brand')\n",
    "    .agg(pl.col('Quantity').sum().alias('Sum of Quantity'))\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .group_by('Brand')\n",
    "    .agg(\n",
    "        pl.col('Unit Price').sum().alias('Sum of Quantity'),\n",
    "        pl.col('Unit Price').mean().alias('Average Unit Price'),\n",
    "    )\n",
    "    .sort('Average Unit Price', descending=True)\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .group_by('Brand')\n",
    "    .agg(\n",
    "        pl.col('Unit Price').mean().round(2).alias('Average Unit Price'),\n",
    "        (pl.col('Unit Price').sum() / pl.len()).round(2).alias('Average Unit Price 2'),\n",
    "        pl.col('Customer Name').first(),\n",
    "        pl.col('Category').last()\n",
    "    )\n",
    "    .sort('Average Unit Price', descending=True)\n",
    "    .sort('Brand')\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    pl.scan_csv('../data/contoso_sales.csv', try_parse_dates=True)\n",
    "    .group_by('Brand')\n",
    "    .agg(\n",
    "        pl.col('Unit Price').mean().round(2).alias('Average Unit Price'),\n",
    "        (pl.col('Unit Price').sum() / pl.len()).round(2).alias('Average Unit Price 2'),\n",
    "        pl.col('Customer Name').first(),\n",
    "        pl.col('Category').last()\n",
    "    )\n",
    "    .sort('Average Unit Price', descending=True)\n",
    "    .sort('Brand')\n",
    "    .collect()\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl.Config.set_fmt_str_lengths = 50\n",
    "print(df.select('Brand').unique().head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select('Brand').unique().head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['POLARS_FMT_STR_LEN'] = str(50)\n",
    "\n",
    "df.select('Brand').unique().head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### There is more..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "for name, data in df.group_by(['Brand']):\n",
    "    print(name[0], type(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .group_by('Brand')\n",
    "    .agg(pl.col('Quantity'))\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .group_by('Brand', maintain_order=True)\n",
    "    .agg(pl.col('Quantity'))\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .group_by(\n",
    "        pl.col('Brand'), \n",
    "        'Customer Country',\n",
    "        pl.col('Order Date').dt.year().alias('Order Year')\n",
    "        )\n",
    "    .agg(pl.col('Unit Price').mean())\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Aggregating values across multiple columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_csv('../data/pokemon.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .select('HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed')\n",
    "    .sum_horizontal().alias('Total 2')\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .with_columns(\n",
    "        pl.sum_horizontal('HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed').alias('Total 2')\n",
    "    )\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .with_columns(\n",
    "        pl.concat_list('HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed').list.sum().alias('Total 2')\n",
    "    )\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n",
    "(\n",
    "    df\n",
    "    .with_columns(\n",
    "        pl.reduce(\n",
    "            function=lambda acc, col: acc + col, \n",
    "            exprs=pl.col(cols)\n",
    "        )\n",
    "        .alias('Total 2')\n",
    "    )\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .with_columns(\n",
    "        pl.fold(\n",
    "            acc=pl.lit(100), \n",
    "            function=lambda acc, col: acc + col, \n",
    "            exprs=pl.col(cols)\n",
    "        )\n",
    "        .alias('Total 2')\n",
    "    )\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### There is more..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .filter(\n",
    "        pl.fold(\n",
    "            acc=pl.lit(True), \n",
    "            function=lambda acc, col: acc & col, \n",
    "            exprs=pl.col(cols) > 80\n",
    "        )\n",
    "    )\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .filter(\n",
    "        pl.all_horizontal(pl.col(cols) > 80)\n",
    "    )\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "str_cols = ['Name', 'Type 1', 'Type 2']\n",
    "str_combined = pl.fold(acc=pl.lit(''), function=lambda acc, col: acc + col, exprs=str_cols).alias('Str Combined')\n",
    "str_cols.append(str_combined)\n",
    "df.select(str_cols).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "str_cols = ['Name', 'Type 1', 'Type 2']\n",
    "df.select(pl.concat_str(str_cols)).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Computing over groups with window functions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os \n",
    "os.environ['POLARS_FMT_STR_LEN'] = str(50) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_csv('../data/contoso_sales.csv', try_parse_dates=True)\n",
    "df = df.with_columns(\n",
    "    (pl.col('Quantity') * pl.col('Net Price')).round(2).alias('Sales Amount')\n",
    ")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sales_by_cat = df.select(\n",
    "    'Category',\n",
    "    'Subcategory',\n",
    "    pl.col('Sales Amount').sum().over('Category').alias('Sales Amt per Cat')\n",
    ")\n",
    "sales_by_cat.head()    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sales_by_cat.filter(pl.col('Category')=='Audio').unique().head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape, sales_by_cat.shape "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .select(\n",
    "        'Category',\n",
    "        'Brand',\n",
    "        'Subcategory',\n",
    "        pl.col('Sales Amount').mean().over('Category', 'Brand').alias('Avg Sales per Cat and Brand')\n",
    "    )\n",
    "    .filter(\n",
    "        (pl.col('Category')=='Computers') \n",
    "    )\n",
    "    .unique()\n",
    "    .sort('Brand')\n",
    "    .head(10)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import date\n",
    "\n",
    "curr_yr = date.today().year\n",
    "cust_birth_yr = curr_yr - pl.col('Customer Age')\n",
    "\n",
    "(\n",
    "    df\n",
    "    .select(\n",
    "        'Category',\n",
    "        'Brand',\n",
    "        'Customer Age',\n",
    "        pl.col('Sales Amount').mean().over('Category', cust_birth_yr).alias('Avg Sales per Cat') \n",
    "    )\n",
    "    .filter(pl.col('Category')=='Computers')\n",
    "    .unique()\n",
    "    .sort('Customer Age')\n",
    "    .head(10)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .group_by('Category')\n",
    "    .agg(pl.col('Sales Amount').max().alias('Max Sales Amt'))\n",
    "    .with_columns(\n",
    "        pl.col('Max Sales Amt').rank(descending=True).alias('Rank')\n",
    "    )\n",
    "    .sort('Rank')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    df\n",
    "    .group_by('Category', 'Subcategory')\n",
    "    .agg(pl.col('Sales Amount').max().round().cast(pl.Int64).alias('Max Sales Amt'))\n",
    "    .with_columns(\n",
    "        pl.col('Max Sales Amt').rank(descending=True).over('Category').cast(pl.Int64).alias('Rank')\n",
    "    )\n",
    "    .filter(pl.col('Category').is_in(['Audio', 'Computers']))\n",
    "    .sort(['Category', 'Rank'])\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### There is more..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_sales_rank = (\n",
    "    df\n",
    "    .group_by('Category', 'Subcategory')\n",
    "    .agg(pl.col('Sales Amount').max().round().cast(pl.Int64).alias('Max Sales Amt'))\n",
    "    .with_columns(\n",
    "        pl.col('Max Sales Amt').rank(descending=True).over('Category').cast(pl.Int64).alias('Rank')\n",
    "    )\n",
    "    .filter(pl.col('Category').is_in(['Audio', 'Computers']))\n",
    "    .sort(['Category', 'Rank'])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_sales_rank.with_columns(\n",
    "    pl.col('Subcategory')\n",
    "    .sort_by('Max Sales Amt')\n",
    "    .head(3)\n",
    "    .over('Category', mapping_strategy='join')\n",
    "    .alias('Lowest 3 Subcat per Cat')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_sales_rank.with_columns(\n",
    "    pl.col('Subcategory')\n",
    "    .sort_by('Max Sales Amt')\n",
    "    .over('Category', mapping_strategy='explode')\n",
    "    .alias('Subcategory Sorted by Max Sales Amt Ascending')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    max_sales_rank\n",
    "    .sort('Subcategory')\n",
    "    .with_columns(\n",
    "        pl.col('Subcategory')\n",
    "        .sort_by('Max Sales Amt')\n",
    "        .over('Category', mapping_strategy='explode')\n",
    "        .alias('Subcategory Sorted by Max Sales Amt Ascending')\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Applying UDFs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_csv('../data/contoso_sales.csv', try_parse_dates=True)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_first_name(full_name: str) -> str:\n",
    "    return full_name.split(' ')[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(\n",
    "    'Customer Name',\n",
    "    pl.col('Customer Name').map_elements(lambda el: get_first_name(el), return_dtype=pl.String).alias('Customer First Name')\n",
    ").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(\n",
    "    'Customer Name',\n",
    "    pl.col('Customer Name').map_elements(lambda el: el.split(' ')[0], return_dtype=pl.String).alias('Customer First Name')\n",
    ").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def age_to_range(age: int) -> str:\n",
    "    if age < 18:\n",
    "        return '~17'\n",
    "    elif age <= 30:\n",
    "        return '18~30'\n",
    "    elif age <= 50:\n",
    "        return '31~50'\n",
    "    elif age <= 70:\n",
    "        return '50~70'\n",
    "    else:\n",
    "        return '71~'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(\n",
    "    'Customer Age',\n",
    "    pl.col('Customer Age').map_elements(lambda el: age_to_range(el), return_dtype=pl.String).alias('Age Range')\n",
    ").head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### There is more..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(\n",
    "    'Customer Name',\n",
    "    pl.col('Customer Name').str.split(' ').list.first().alias('Customer First Name')\n",
    ").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(\n",
    "    'Customer Age',\n",
    "    pl.when(pl.col('Customer Age')<18).then(pl.lit('~17'))\n",
    "    .when(pl.col('Customer Age')<=30).then(pl.lit('18~30'))\n",
    "    .when(pl.col('Customer Age')<=50).then(pl.lit('31~50'))\n",
    "    .when(pl.col('Customer Age')<=70).then(pl.lit('51~70'))\n",
    "    .when(pl.col('Customer Age')>70).then(pl.lit('71~'))\n",
    "    .alias('Age Range')\n",
    ").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "df.select(\n",
    "    'Customer Name',\n",
    "    pl.col('Customer Name').map_elements(lambda el: el.split(' ')[0], return_dtype=pl.String).alias('Customer First Name')\n",
    ").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "df.select(\n",
    "    'Customer Name',\n",
    "    pl.col('Customer Name').str.split(' ').list.first().alias('Customer First Name')\n",
    ").head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using SQL for data transformations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (5, 20)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>Order Number</th><th>Line Number</th><th>Order Date</th><th>Delivery Date</th><th>Customer Name</th><th>Customer Gender</th><th>Customer Country</th><th>Customer Age</th><th>Store Name</th><th>Product Name</th><th>Color</th><th>Brand</th><th>Category</th><th>Subcategory</th><th>Quantity</th><th>Unit Price</th><th>Net Price</th><th>Unit Cost</th><th>Currency Code</th><th>Exchange Rate</th></tr><tr><td>i64</td><td>i64</td><td>date</td><td>date</td><td>str</td><td>str</td><td>str</td><td>i64</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>i64</td><td>f64</td><td>f64</td><td>f64</td><td>str</td><td>f64</td></tr></thead><tbody><tr><td>284806</td><td>1</td><td>2017-10-18</td><td>2017-10-20</td><td>&quot;Eric Kennedy&quot;</td><td>&quot;Male&quot;</td><td>&quot;United States&quot;</td><td>47</td><td>&quot;Online store&quot;</td><td>&quot;Contoso 512MB …</td><td>&quot;Silver&quot;</td><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td><td>&quot;MP4&amp;MP3&quot;</td><td>7</td><td>11.691</td><td>10.288</td><td>5.958</td><td>&quot;USD&quot;</td><td>1.0</td></tr><tr><td>285506</td><td>1</td><td>2017-10-25</td><td>2017-10-26</td><td>&quot;George Tooth&quot;</td><td>&quot;Male&quot;</td><td>&quot;Australia&quot;</td><td>30</td><td>&quot;Online store&quot;</td><td>&quot;Contoso 512MB …</td><td>&quot;Silver&quot;</td><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td><td>&quot;MP4&amp;MP3&quot;</td><td>1</td><td>11.691</td><td>11.691</td><td>5.958</td><td>&quot;AUD&quot;</td><td>1.2967</td></tr><tr><td>311002</td><td>2</td><td>2018-07-07</td><td>2018-07-12</td><td>&quot;Caleb Greene&quot;</td><td>&quot;Male&quot;</td><td>&quot;Australia&quot;</td><td>59</td><td>&quot;Online store&quot;</td><td>&quot;Contoso 512MB …</td><td>&quot;Silver&quot;</td><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td><td>&quot;MP4&amp;MP3&quot;</td><td>6</td><td>12.99</td><td>12.99</td><td>6.62</td><td>&quot;AUD&quot;</td><td>1.3484</td></tr><tr><td>366307</td><td>2</td><td>2020-01-11</td><td>2020-01-11</td><td>&quot;Isaac Siddins&quot;</td><td>&quot;Male&quot;</td><td>&quot;Australia&quot;</td><td>25</td><td>&quot;Contoso Store …</td><td>&quot;Contoso 512MB …</td><td>&quot;Blue&quot;</td><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td><td>&quot;MP4&amp;MP3&quot;</td><td>4</td><td>12.99</td><td>12.99</td><td>6.62</td><td>&quot;AUD&quot;</td><td>1.4545</td></tr><tr><td>325708</td><td>3</td><td>2018-12-01</td><td>2018-12-02</td><td>&quot;Mike McQueen&quot;</td><td>&quot;Male&quot;</td><td>&quot;United States&quot;</td><td>56</td><td>&quot;Online store&quot;</td><td>&quot;Contoso 512MB …</td><td>&quot;Blue&quot;</td><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td><td>&quot;MP4&amp;MP3&quot;</td><td>2</td><td>12.99</td><td>11.5611</td><td>6.62</td><td>&quot;USD&quot;</td><td>1.0</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (5, 20)\n",
       "┌────────┬────────┬─────────────┬─────────────┬───┬─────────────┬───────┬─────────────┬────────────┐\n",
       "│ Order  ┆ Line   ┆ Order Date  ┆ Delivery    ┆ … ┆ Net Price   ┆ Unit  ┆ Currency    ┆ Exchange   │\n",
       "│ Number ┆ Number ┆ ---         ┆ Date        ┆   ┆ ---         ┆ Cost  ┆ Code        ┆ Rate       │\n",
       "│ ---    ┆ ---    ┆ date        ┆ ---         ┆   ┆ f64         ┆ ---   ┆ ---         ┆ ---        │\n",
       "│ i64    ┆ i64    ┆             ┆ date        ┆   ┆             ┆ f64   ┆ str         ┆ f64        │\n",
       "╞════════╪════════╪═════════════╪═════════════╪═══╪═════════════╪═══════╪═════════════╪════════════╡\n",
       "│ 284806 ┆ 1      ┆ 2017-10-18  ┆ 2017-10-20  ┆ … ┆ 10.288      ┆ 5.958 ┆ USD         ┆ 1.0        │\n",
       "│ 285506 ┆ 1      ┆ 2017-10-25  ┆ 2017-10-26  ┆ … ┆ 11.691      ┆ 5.958 ┆ AUD         ┆ 1.2967     │\n",
       "│ 311002 ┆ 2      ┆ 2018-07-07  ┆ 2018-07-12  ┆ … ┆ 12.99       ┆ 6.62  ┆ AUD         ┆ 1.3484     │\n",
       "│ 366307 ┆ 2      ┆ 2020-01-11  ┆ 2020-01-11  ┆ … ┆ 12.99       ┆ 6.62  ┆ AUD         ┆ 1.4545     │\n",
       "│ 325708 ┆ 3      ┆ 2018-12-01  ┆ 2018-12-02  ┆ … ┆ 11.5611     ┆ 6.62  ┆ USD         ┆ 1.0        │\n",
       "└────────┴────────┴─────────────┴─────────────┴───┴─────────────┴───────┴─────────────┴────────────┘"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import polars as pl\n",
    "df = pl.read_csv('../data/contoso_sales.csv', try_parse_dates=True)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (5, 3)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>Customer Name</th><th>Brand</th><th>Category</th></tr><tr><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;Eric Kennedy&quot;</td><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td></tr><tr><td>&quot;George Tooth&quot;</td><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td></tr><tr><td>&quot;Caleb Greene&quot;</td><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td></tr><tr><td>&quot;Isaac Siddins&quot;</td><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td></tr><tr><td>&quot;Mike McQueen&quot;</td><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (5, 3)\n",
       "┌───────────────┬─────────┬──────────┐\n",
       "│ Customer Name ┆ Brand   ┆ Category │\n",
       "│ ---           ┆ ---     ┆ ---      │\n",
       "│ str           ┆ str     ┆ str      │\n",
       "╞═══════════════╪═════════╪══════════╡\n",
       "│ Eric Kennedy  ┆ Contoso ┆ Audio    │\n",
       "│ George Tooth  ┆ Contoso ┆ Audio    │\n",
       "│ Caleb Greene  ┆ Contoso ┆ Audio    │\n",
       "│ Isaac Siddins ┆ Contoso ┆ Audio    │\n",
       "│ Mike McQueen  ┆ Contoso ┆ Audio    │\n",
       "└───────────────┴─────────┴──────────┘"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ctx = pl.SQLContext(eager=True)\n",
    "ctx.register('df', df)\n",
    "ctx.execute(\n",
    "    \"\"\"\n",
    "      select\n",
    "        `Customer Name`,\n",
    "        Brand,\n",
    "        Category\n",
    "      from df limit 5\n",
    "    \"\"\"\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (5, 2)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>Brand</th><th>Avg Quantity</th></tr><tr><td>str</td><td>f64</td></tr></thead><tbody><tr><td>&quot;Fabrikam&quot;</td><td>3.225532</td></tr><tr><td>&quot;Northwind Trad…</td><td>3.222222</td></tr><tr><td>&quot;Wide World Imp…</td><td>3.193811</td></tr><tr><td>&quot;Fabrikam  &quot;</td><td>3.192308</td></tr><tr><td>&quot;Southridge Vid…</td><td>3.189509</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (5, 2)\n",
       "┌──────────────────────┬──────────────┐\n",
       "│ Brand                ┆ Avg Quantity │\n",
       "│ ---                  ┆ ---          │\n",
       "│ str                  ┆ f64          │\n",
       "╞══════════════════════╪══════════════╡\n",
       "│ Fabrikam             ┆ 3.225532     │\n",
       "│ Northwind Traders    ┆ 3.222222     │\n",
       "│ Wide World Importers ┆ 3.193811     │\n",
       "│ Fabrikam             ┆ 3.192308     │\n",
       "│ Southridge Video     ┆ 3.189509     │\n",
       "└──────────────────────┴──────────────┘"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ctx.execute(\n",
    "    \"\"\"\n",
    "      select\n",
    "        Brand,\n",
    "        avg(Quantity) as `Avg Quantity` \n",
    "      from df\n",
    "      group by \n",
    "        Brand\n",
    "      order by \n",
    "        `Avg Quantity` desc\n",
    "      limit 5\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (5, 2)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>Brand</th><th>Category</th></tr><tr><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td></tr><tr><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td></tr><tr><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td></tr><tr><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td></tr><tr><td>&quot;Contoso&quot;</td><td>&quot;Audio&quot;</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (5, 2)\n",
       "┌─────────┬──────────┐\n",
       "│ Brand   ┆ Category │\n",
       "│ ---     ┆ ---      │\n",
       "│ str     ┆ str      │\n",
       "╞═════════╪══════════╡\n",
       "│ Contoso ┆ Audio    │\n",
       "│ Contoso ┆ Audio    │\n",
       "│ Contoso ┆ Audio    │\n",
       "│ Contoso ┆ Audio    │\n",
       "│ Contoso ┆ Audio    │\n",
       "└─────────┴──────────┘"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl.SQLContext(lf=df.lazy()).execute(\n",
    "    \"\"\"\n",
    "        select \n",
    "            Brand,\n",
    "            Category\n",
    "        from lf\n",
    "        limit 5\n",
    "    \"\"\"\n",
    ").collect()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
